{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. import 工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#首先 import 必要的模块\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"pima-indians-diabetes.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(768, 9)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#检查数据规模 读取测试数据\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      "pregnants                       768 non-null int64\n",
      "Plasma_glucose_concentration    768 non-null int64\n",
      "blood_pressure                  768 non-null int64\n",
      "Triceps_skin_fold_thickness     768 non-null int64\n",
      "serum_insulin                   768 non-null int64\n",
      "BMI                             768 non-null float64\n",
      "Diabetes_pedigree_function      768 non-null float64\n",
      "Age                             768 non-null int64\n",
      "Target                          768 non-null int64\n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n"
     ]
    }
   ],
   "source": [
    "##查看数据基本信息\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "      <td>0.348958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "      <td>0.476951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "count  768.000000                    768.000000      768.000000   \n",
       "mean     3.845052                    120.894531       69.105469   \n",
       "std      3.369578                     31.972618       19.355807   \n",
       "min      0.000000                      0.000000        0.000000   \n",
       "25%      1.000000                     99.000000       62.000000   \n",
       "50%      3.000000                    117.000000       72.000000   \n",
       "75%      6.000000                    140.250000       80.000000   \n",
       "max     17.000000                    199.000000      122.000000   \n",
       "\n",
       "       Triceps_skin_fold_thickness  serum_insulin         BMI  \\\n",
       "count                   768.000000     768.000000  768.000000   \n",
       "mean                     20.536458      79.799479   31.992578   \n",
       "std                      15.952218     115.244002    7.884160   \n",
       "min                       0.000000       0.000000    0.000000   \n",
       "25%                       0.000000       0.000000   27.300000   \n",
       "50%                      23.000000      30.500000   32.000000   \n",
       "75%                      32.000000     127.250000   36.600000   \n",
       "max                      99.000000     846.000000   67.100000   \n",
       "\n",
       "       Diabetes_pedigree_function         Age      Target  \n",
       "count                  768.000000  768.000000  768.000000  \n",
       "mean                     0.471876   33.240885    0.348958  \n",
       "std                      0.331329   11.760232    0.476951  \n",
       "min                      0.078000   21.000000    0.000000  \n",
       "25%                      0.243750   24.000000    0.000000  \n",
       "50%                      0.372500   29.000000    0.000000  \n",
       "75%                      0.626250   41.000000    1.000000  \n",
       "max                      2.420000   81.000000    1.000000  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看数值型特征的基本统计量\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从结果中我们可以看到很多列的最小值为0。而在一些特定列代表的变量中，0值并没有意义，这就表名该值无效或为缺失值。\n",
    "具体来说，下列变量的最小值为0时数据无意义： 1、血浆葡萄糖浓度 2、舒张压 3、肱三头肌皮褶厚度 4、餐后血清胰岛素 5、体重指数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Plasma_glucose_concentration      5\n",
      "blood_pressure                   35\n",
      "Triceps_skin_fold_thickness     227\n",
      "serum_insulin                   374\n",
      "BMI                              11\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "NaN_col_names = ['Plasma_glucose_concentration','blood_pressure','Triceps_skin_fold_thickness','serum_insulin','BMI']\n",
    "print((train[NaN_col_names] <= 0).sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第1、2、5列中0值较少；相比较而言，第3、4列中的0值多出数倍，接近总量的一半。 为了确保有足够的数据量来训练模型，针对不同的列需要有不同的缺失值判断策略。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ill_rate(data):\n",
    "    return (data['Target'] > 0).sum() / len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "血浆葡萄糖浓度缺省时患病率为: 0.4\n",
      "血浆葡萄糖浓度有值时患病率为: 0.3486238532110092\n"
     ]
    }
   ],
   "source": [
    "Plasma_glucose_concentration_nan_ill_rate=ill_rate(train[train['Plasma_glucose_concentration']<=0])\n",
    "print('血浆葡萄糖浓度缺省时患病率为:', Plasma_glucose_concentration_nan_ill_rate)\n",
    "\n",
    "Plasma_glucose_concentration_ill_rate=ill_rate(train[train['Plasma_glucose_concentration']>0])\n",
    "print('血浆葡萄糖浓度有值时患病率为:',Plasma_glucose_concentration_ill_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上可知：第1列缺省值的样本数只占总样本数的比例才0.6%，占比很小，且是否缺省对患病率影响不大(±5.13761467889908%)，可直接丢弃。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "舒张压缺省时患病率为: 0.45714285714285713\n",
      "舒张压有值时患病率为: 0.34379263301500684\n"
     ]
    }
   ],
   "source": [
    "blood_pressure_nan_ill_rate=ill_rate(train[train['blood_pressure']<=0])\n",
    "print('舒张压缺省时患病率为:', blood_pressure_nan_ill_rate)\n",
    "\n",
    "blood_pressure_ill_rate=ill_rate(train[train['blood_pressure']>0])\n",
    "print('舒张压有值时患病率为:',blood_pressure_ill_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上可知：第2列中0值较少，是否缺省对患病率稍有影响(±11.335022412785029%)，用中值填充缺省值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "体重指数缺省时患病率为: 0.18181818181818182\n",
      "体重指数有值时患病率为: 0.3513870541611625\n"
     ]
    }
   ],
   "source": [
    "BMI_nan_ill_rate=ill_rate(train[train['BMI']<=0])\n",
    "print('体重指数缺省时患病率为:', BMI_nan_ill_rate)\n",
    "\n",
    "BMI_ill_rate=ill_rate(train[train['BMI']>0])\n",
    "print('体重指数有值时患病率为:',BMI_ill_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上可知：第5列中0值较少，是否缺省对患病率影响较大(±16.956887234298068%)，新增BMI_Missing\"表示是否缺省值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "肱三头肌皮褶厚度缺省时患病率为: 0.3876651982378855\n",
      "肱三头肌皮褶厚度有值时患病率为: 0.33271719038817005\n"
     ]
    }
   ],
   "source": [
    "Triceps_skin_fold_thickness_nan_ill_rate=ill_rate(train[train['Triceps_skin_fold_thickness']<=0])\n",
    "print('肱三头肌皮褶厚度缺省时患病率为:', Triceps_skin_fold_thickness_nan_ill_rate)\n",
    "\n",
    "Triceps_skin_fold_thickness_ill_rate=ill_rate(train[train['Triceps_skin_fold_thickness']>0])\n",
    "print('肱三头肌皮褶厚度有值时患病率为:',Triceps_skin_fold_thickness_ill_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上可知：第3列中0值的样本数占比很大，是否缺省对患病率影响较不大(±5.494800784971545%)，指标比较集中，简单用众数填充缺省值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "餐后血清胰岛素缺省时患病率为: 0.3689839572192513\n",
      "餐后血清胰岛素有值时患病率为: 0.3299492385786802\n"
     ]
    }
   ],
   "source": [
    "serum_insulin_nan_ill_rate=ill_rate(train[train['serum_insulin']<=0])\n",
    "print('餐后血清胰岛素缺省时患病率为:', serum_insulin_nan_ill_rate)\n",
    "\n",
    "serum_insulin_ill_rate=ill_rate(train[train['serum_insulin']>0])\n",
    "print('餐后血清胰岛素有值时患病率为:',serum_insulin_ill_rate)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上可知：第4列中0值的样本数占比很大，是否缺省对患病率影响较较小(±3.90347186405711%)，指标比较分散，简单用均值填充缺省值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pregnants                       False\n",
       "Plasma_glucose_concentration    False\n",
       "blood_pressure                  False\n",
       "Triceps_skin_fold_thickness     False\n",
       "serum_insulin                   False\n",
       "BMI                             False\n",
       "Diabetes_pedigree_function      False\n",
       "Age                             False\n",
       "Target                          False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#丢弃Plasma_glucose_concentration=0的5个样本\n",
    "train=train[train['Plasma_glucose_concentration']>0]\n",
    "\n",
    "#用填补法填充缺省值\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "def rf(data,column,column_names,_strategy):\n",
    "    data[column]=data[column].map(lambda x: x if x>0 else np.NaN)\n",
    "    data=SimpleImputer(missing_values=np.NaN, strategy=_strategy).fit_transform(data)\n",
    "    return pd.DataFrame(data,columns=column_names)\n",
    "\n",
    "columns=train.columns\n",
    "\n",
    "train=rf(rf(rf(train,'blood_pressure',columns,'median'),'Triceps_skin_fold_thickness',columns,'most_frequent'),'serum_insulin',columns,'mean')\n",
    "\n",
    "np.isnan(train).any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Plasma_glucose_concentration     0\n",
      "blood_pressure                   0\n",
      "Triceps_skin_fold_thickness      0\n",
      "serum_insulin                    0\n",
      "BMI                             11\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print((train[NaN_col_names] <= 0).sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3.数据标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>BMI_missing</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.352941</td>\n",
       "      <td>0.670968</td>\n",
       "      <td>0.489796</td>\n",
       "      <td>0.304348</td>\n",
       "      <td>0.170535</td>\n",
       "      <td>0.500745</td>\n",
       "      <td>0.234415</td>\n",
       "      <td>0.483333</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.058824</td>\n",
       "      <td>0.264516</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.239130</td>\n",
       "      <td>0.170535</td>\n",
       "      <td>0.396423</td>\n",
       "      <td>0.116567</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.470588</td>\n",
       "      <td>0.896774</td>\n",
       "      <td>0.408163</td>\n",
       "      <td>0.271739</td>\n",
       "      <td>0.170535</td>\n",
       "      <td>0.347243</td>\n",
       "      <td>0.253629</td>\n",
       "      <td>0.183333</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.058824</td>\n",
       "      <td>0.290323</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.173913</td>\n",
       "      <td>0.096154</td>\n",
       "      <td>0.418778</td>\n",
       "      <td>0.038002</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.163265</td>\n",
       "      <td>0.304348</td>\n",
       "      <td>0.185096</td>\n",
       "      <td>0.642325</td>\n",
       "      <td>0.943638</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.352941                      0.670968        0.489796   \n",
       "1   0.058824                      0.264516        0.428571   \n",
       "2   0.470588                      0.896774        0.408163   \n",
       "3   0.058824                      0.290323        0.428571   \n",
       "4   0.000000                      0.600000        0.163265   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.304348       0.170535  0.500745   \n",
       "1                     0.239130       0.170535  0.396423   \n",
       "2                     0.271739       0.170535  0.347243   \n",
       "3                     0.173913       0.096154  0.418778   \n",
       "4                     0.304348       0.185096  0.642325   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  BMI_missing  Target  \n",
       "0                    0.234415  0.483333            1     1.0  \n",
       "1                    0.116567  0.166667            1     0.0  \n",
       "2                    0.253629  0.183333            1     1.0  \n",
       "3                    0.038002  0.000000            1     0.0  \n",
       "4                    0.943638  0.200000            1     1.0  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#  get labels\n",
    "y_train = train['Target']   \n",
    "X_train = train.drop([\"Target\"], axis=1)\n",
    "\n",
    "#用于保存特征工程之后的结果\n",
    "feat_names = X_train.columns\n",
    "\n",
    "# 数据标准化\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# 初始化特征的标准化器\n",
    "ss_X = MinMaxScaler()\n",
    "\n",
    "# 分别对训练和测试数据的特征进行规模化特征到一定的范围内\n",
    "X_train = ss_X.fit_transform(X_train)\n",
    "\n",
    "\n",
    "X_train = pd.DataFrame(columns = feat_names, data = X_train)\n",
    "\n",
    "#新增BMI_Missing\"表示体重指数是否缺省值\n",
    "X_train['BMI_missing'] = train['BMI'].apply(lambda x: 1 if x>0 else 0)\n",
    "\n",
    "train = pd.concat([X_train, y_train], axis = 1)\n",
    "\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pregnants                       False\n",
       "Plasma_glucose_concentration    False\n",
       "blood_pressure                  False\n",
       "Triceps_skin_fold_thickness     False\n",
       "serum_insulin                   False\n",
       "BMI                             False\n",
       "Diabetes_pedigree_function      False\n",
       "Age                             False\n",
       "BMI_missing                     False\n",
       "Target                          False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.isnan(train).any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#存为csv格式\n",
    "train.to_csv('FE_pima-indians-diabetes.csv',index = False,header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
